package com.kdtech.analyse.news;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.analyse.JSoupUtils;
import com.kdtech.analyse.ParseState;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;

import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Recognises and parses news detail pages served from cnr.cn.
 *
 * <p>{@link #isDetailPage(String)} decides whether a URL has the shape of an
 * article page, and {@link #parserHtml(UrlMeta)} extracts title, date, content
 * and author from the downloaded HTML by trying a fixed list of CSS selectors
 * in order until one yields a value.
 */
public class CnrNewsAnalyse implements AnalyseNews {

	/**
	 * URL shapes accepted as detail pages. Compiled once because
	 * {@link #isDetailPage(String)} is evaluated for every candidate URL.
	 * Example URLs:
	 * <pre>
	 * http://cnr.cn/gundong/201211/t20121123_511393396.shtml
	 * http://auto.cnr.cn/news/storys_19877.html
	 * http://auto.cnr.cn/news/20121120/story_251629.html
	 * http://tech.cnr.cn/jdxw/201211/t20121123_511396094.html
	 * </pre>
	 */
	private static final Pattern[] DETAIL_PAGE_PATTERNS={
		Pattern.compile("http://.*.cnr.cn/.*/t[0-9]{8}_[0-9]*_[a-z]*.html.*"),
		Pattern.compile("http://.*cnr.cn/.*[0-9]{6}/t[0-9]{8}_[0-9]*.[s]{0,1}html"),
		Pattern.compile("http://auto[.]cnr[.]cn/news/.*story[s]{0,1}_[0-9]*.html")
	};

	/** Title text that identifies the site's "page not found" document. */
	private static final String NOT_FOUND_TITLE="中国广播网--404页面";

	/** Title selectors, tried in order before falling back to the {@code <title>} tag. */
	private static final String[] TITLE_SELECTORS={
		"div.hei",
		"h2.hei",
		"div.ina_news_text>h1>p",
		"td.tbt",
		"p.f22"
	};

	/**
	 * Date selectors tried in order before the {@code p.txtcenter} fallback.
	 * The first entry keeps the trailing space of the original selector string.
	 */
	private static final String[] LEADING_DATE_SELECTORS={
		"div.lh26 ",
		"div.ina_news_text>h1",
		"p.lh26",
		"span#pubtime_baidu"
	};

	/** Content containers, tried in order; each is passed through {@link HtmlCleaner}. */
	private static final String[] CONTENT_SELECTORS={
		"div.f12_5a5a5a",
		"div.ina_news_contents",
		"div.ina_news_pic_text",
		"div.sanji_left_text_3",
		"div#scroll",
		"div.sanji_left",
		".pswp__desc"
	};

	/**
	 * Tells whether {@code url} has the shape of a cnr.cn article detail page.
	 *
	 * @param url the URL to test; may be {@code null}
	 * @return {@code true} if the whole URL matches one of the detail-page
	 *         patterns, {@code false} otherwise (including for {@code null})
	 */
	public boolean isDetailPage(String url) {
		if (url == null) {
			return false;
		}
		for (Pattern pattern : DETAIL_PAGE_PATTERNS) {
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Parses a downloaded detail page into a {@link NewsMeta}.
	 *
	 * @param urlMeta the crawled page (URL plus HTML)
	 * @return {@code null} when {@code urlMeta} or its HTML is {@code null} or
	 *         the URL is not a detail page; a meta carrying only the URL and an
	 *         {@code ERR404} update URL when the page is the site's 404
	 *         document; otherwise a meta with URL, title, content, date and
	 *         author filled in
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		if (urlMeta == null) {
			return null;
		}
		String url=urlMeta.getUrl();
		String html=urlMeta.getHtml();
		// Without HTML there is nothing to hand to Jsoup; report "nothing parsed"
		// the same way as for a non-detail URL.
		if (html == null || !isDetailPage(url)) {
			return null;
		}

		NewsMeta meta=new NewsMeta();
		meta.setUrl(url);
		Document doc=Jsoup.parse(html);

		// Parse the news title.
		String title=parseTitle(doc);
		if (NOT_FOUND_TITLE.equals(title)) {
			meta.setUpdateUrl(ParseState.ERR404.toString());
			return meta;
		}

		meta.setTitle(title);
		// Parse the content.
		meta.setContent(parseContent(doc, url));
		// Parse the publication time.
		meta.setDate(parseDate(doc, url));
		// Click and comment counts are not extracted from the page.
		meta.setClickNum(0);
		meta.setCommentNum(0);
		meta.setAuthor(JSoupUtils.matchAuthor(doc, "来源："));
		// No address for later updates is derived from the page.
		meta.setUpdateUrl(null);
		return meta;
	}

	/**
	 * Extracts the title: the first non-blank text among {@link #TITLE_SELECTORS},
	 * then the {@code <title>} tag up to the first {@code " - "}, then the first
	 * {@code p.txtcenter} element when there is more than one of them.
	 */
	private static String parseTitle(Document doc) {
		for (String selector : TITLE_SELECTORS) {
			String text=doc.select(selector).text();
			if (StringUtils.isNotBlank(text)) {
				return text;
			}
		}
		String title=StringUtils.substringBefore(doc.select("title").text(), " - ");
		if (StringUtils.isNotBlank(title)) {
			return title;
		}
		Elements centered=doc.select("p.txtcenter");
		if (centered.size() > 1) {
			return centered.get(0).text();
		}
		return title;
	}

	/**
	 * Extracts the date: the first match among {@link #LEADING_DATE_SELECTORS},
	 * then the second {@code p.txtcenter} element, then {@code span.lh20}, and
	 * finally the URL itself.
	 *
	 * @return the value produced by {@link DateUtils#matchDate}, or {@code null}
	 *         if no source yields a date
	 */
	private static Long parseDate(Document doc, String url) {
		for (String selector : LEADING_DATE_SELECTORS) {
			Long found=DateUtils.matchDate(doc.select(selector).text());
			if (found != null) {
				return found;
			}
		}
		Elements centered=doc.select("p.txtcenter");
		if (centered.size() > 1) {
			Long found=DateUtils.matchDate(centered.get(1).text());
			if (found != null) {
				return found;
			}
		}
		Long fromSpan=DateUtils.matchDate(doc.select("span.lh20").text());
		if (fromSpan != null) {
			return fromSpan;
		}
		return DateUtils.matchDate(url);
	}

	/**
	 * Extracts the article body: the first non-blank cleaned HTML among
	 * {@link #CONTENT_SELECTORS}, otherwise the raw inner HTML of
	 * {@code .TRS_Editor}.
	 */
	private static String parseContent(Document doc, String url) {
		for (String selector : CONTENT_SELECTORS) {
			String cleaned=HtmlCleaner.getContentHtml(url, doc.select(selector));
			if (StringUtils.isNotBlank(cleaned)) {
				return cleaned;
			}
		}
		return doc.select(".TRS_Editor").html();
	}

	/**
	 * Updating an already parsed article is not implemented for this site.
	 *
	 * @param meta the previously parsed article
	 * @return always {@code null}
	 */
	public NewsMeta Update(NewsMeta meta) {
		return null;
	}

	/**
	 * Manual smoke test: downloads one sample URL and prints the parse result.
	 */
	public static void main(String[] args) {
		String url="http://m.cnr.cn/jdt/ttyc/20151021/t20151021_520215003_tt.html?tt_group_id=6207732999910326530";
		UrlMeta urlMeta=CrawlHTML.responseToURL(url);
		CnrNewsAnalyse analyser=new CnrNewsAnalyse();
		if (analyser.isDetailPage(url)) {
			NewsMeta parsed=analyser.parserHtml(urlMeta);
			System.out.println(parsed);
		} else {
			System.out.println("不符合正则");
		}
	}

	/**
	 * @return always {@code false}
	 */
	public boolean isNeedUpdate(){
		return false;
	}
}
